Skip to content

Commit 9cf7363

Browse files
committed
Synchronize after every op
The previous commit loosened the synchronization barriers too much, and it caused some rare brain damage when using quants. With this change, we're now only using 3-4x fewer sychronization barriers than before. A new --trap flag has been introduced by this change, which enables the runtime to detect the precise moment a NaN appears anywhere in the code and then aborts with both a C++ backtrace and a GGML graph trace. Some jankiness with pledge() and /dev/tty is also fixed by this change.
1 parent bd8c0de commit 9cf7363

File tree

8 files changed

+298
-85
lines changed

8 files changed

+298
-85
lines changed

build/config.mk

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ MKDEPS = $(COSMOCC)/bin/mkdeps
1313
INSTALL = install
1414

1515
ARFLAGS = rcsD
16-
CCFLAGS = -g -O3 -fexceptions
17-
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes
16+
CCFLAGS = -g -O3 -fexceptions -fsignaling-nans
17+
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes -DLLAMAFILE_DEBUG
1818
TARGET_ARCH = -Xx86_64-mavx -Xx86_64-mtune=znver4
1919

2020
TMPDIR = o//tmp

llama.cpp/common.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
233233
if (arg == "--cli") {
234234
return true;
235235
}
236+
if (arg == "--trap") {
237+
FLAG_trap = true;
238+
FLAG_unsecure = true; // for better backtraces
239+
llamafile_trapping_enabled(+1);
240+
return true;
241+
}
236242
if (arg == "--unsecure") {
237243
FLAG_unsecure = true;
238244
return true;

llama.cpp/console.cpp

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
22
// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
3+
34
#include "console.h"
5+
46
#include <vector>
57
#include <iostream>
6-
78
#include <climits>
89
#include <sys/ioctl.h>
910
#include <unistd.h>
1011
#include <wchar.h>
12+
#include <cosmo.h>
1113
#include <stdio.h>
1214
#include <stdlib.h>
1315
#include <signal.h>
@@ -30,6 +32,7 @@ namespace console {
3032

3133
static bool advanced_display = false;
3234
static bool simple_io = true;
35+
static bool should_close_tty = false;
3336
static display_t current_display = reset;
3437
static FILE* out = stdout;
3538
static FILE* tty = nullptr;
@@ -40,19 +43,32 @@ static termios initial_state;
4043
//
4144

4245
void init(bool use_simple_io, bool use_advanced_display) {
43-
advanced_display = use_advanced_display;
46+
should_close_tty = false;
4447
simple_io = use_simple_io;
48+
advanced_display = use_advanced_display;
4549
if (!simple_io) {
46-
struct termios new_termios;
47-
tcgetattr(STDIN_FILENO, &initial_state);
48-
new_termios = initial_state;
49-
new_termios.c_lflag &= ~(ICANON | ECHO);
50-
new_termios.c_cc[VMIN] = 1;
51-
new_termios.c_cc[VTIME] = 0;
52-
tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
5350
tty = fopen("/dev/tty", "w+e");
51+
if (tty) {
52+
should_close_tty = true;
53+
} else if (IsLinux() || IsOpenbsd()) {
54+
// this could happen because pledge() blocked us
55+
tty = fdopen(0, "w+e");
56+
}
5457
if (tty != nullptr) {
55-
out = tty;
58+
if (!tcgetattr(fileno(tty), &initial_state)) {
59+
out = tty;
60+
struct termios new_termios = initial_state;
61+
new_termios.c_lflag &= ~(ICANON | ECHO);
62+
new_termios.c_cc[VMIN] = 1;
63+
new_termios.c_cc[VTIME] = 0;
64+
tcsetattr(fileno(tty), TCSANOW, &new_termios);
65+
} else {
66+
simple_io = true;
67+
fclose(tty);
68+
tty = 0;
69+
}
70+
} else {
71+
simple_io = true;
5672
}
5773
}
5874
setlocale(LC_ALL, "");
@@ -64,11 +80,14 @@ void cleanup() {
6480
// Restore settings
6581
if (!simple_io) {
6682
if (tty != nullptr) {
67-
out = stdout;
68-
fclose(tty);
83+
fflush(tty);
84+
tcsetattr(fileno(tty), TCSANOW, &initial_state);
85+
if (should_close_tty) {
86+
fclose(tty);
87+
}
6988
tty = nullptr;
89+
out = stdout;
7090
}
71-
tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
7291
}
7392
}
7493

llama.cpp/ggml.c

Lines changed: 31 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ void ggml_print_backtrace(void) {
117117

118118
/*#define GGML_PERF*/
119119
#define GGML_DEBUG 0
120-
#define GGML_GELU_FP16
121-
#define GGML_GELU_QUICK_FP16
122-
#define GGML_SILU_FP16
120+
// #define GGML_GELU_FP16
121+
// #define GGML_GELU_QUICK_FP16
122+
// #define GGML_SILU_FP16
123123
// #define GGML_CROSS_ENTROPY_EXP_FP16
124124
// #define GGML_FLASH_ATTN_EXP_FP16
125125

@@ -2679,6 +2679,7 @@ static inline int ggml_up(int n, int m) {
26792679
struct ggml_context * ggml_init(struct ggml_init_params params) {
26802680
// make this function thread safe
26812681
ggml_critical_section_start();
2682+
llamafile_trapping_enabled(-1);
26822683

26832684
static bool is_first_call = true;
26842685

@@ -2750,6 +2751,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
27502751
if (ctx == NULL) {
27512752
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
27522753

2754+
llamafile_trapping_enabled(+1);
27532755
ggml_critical_section_end();
27542756

27552757
return NULL;
@@ -2781,6 +2783,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
27812783

27822784
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
27832785

2786+
llamafile_trapping_enabled(+1);
27842787
ggml_critical_section_end();
27852788

27862789
return ctx;
@@ -7105,19 +7108,12 @@ void ggml_syncthreads(struct ggml_compute_params * params) {
71057108
// ops must call this before accessing wdata
71067109
// otherwise overlapping threads on previous op might clobber
71077110
void *ggml_acquire_wdata(struct ggml_compute_params * params) {
7108-
if (params->limbo) {
7109-
ggml_syncthreads(params);
7110-
}
7111-
params->limbo = true;
71127111
return params->wdata;
71137112
}
71147113

71157114
// ops should call this after writing to wdata before reading
71167115
void ggml_release_wdata(struct ggml_compute_params * params) {
7117-
if (params->limbo) {
7118-
ggml_syncthreads(params);
7119-
params->limbo = false;
7120-
}
7116+
ggml_syncthreads(params);
71217117
}
71227118

71237119
////////////////////////////////////////////////////////////////////////////////
@@ -11531,12 +11527,9 @@ static void ggml_compute_forward_set_f32(
1153111527

1153211528
if (!inplace) {
1153311529
if (!params->ith) {
11534-
// memcpy needs to be synchronized across threads to avoid race conditions.
11535-
// => do it in INIT phase
11536-
memcpy(
11537-
((char *) dst->data),
11538-
((char *) src0->data),
11539-
ggml_nbytes(dst));
11530+
memcpy(((char *) dst->data),
11531+
((char *) src0->data),
11532+
ggml_nbytes(dst));
1154011533
}
1154111534
ggml_syncthreads(params);
1154211535
}
@@ -17759,35 +17752,6 @@ struct ggml_compute_state {
1775917752
enum ggml_status ec;
1776017753
};
1776117754

17762-
// returns true if `src` is direct dependency of `node`
17763-
static bool ggml_has_src(const struct ggml_tensor * node,
17764-
const struct ggml_tensor * src) {
17765-
for (int i = 0; i < GGML_MAX_SRC; ++i) {
17766-
if (node->src[i] == src) {
17767-
return true;
17768-
}
17769-
}
17770-
return false;
17771-
}
17772-
17773-
// returns true if `dest` depends on any outputs since `mark`
17774-
//
17775-
// - cgraph->nodes[dest] is about to be executed
17776-
// - syncthreads() last happened right before cgraph->nodes[mark] executed
17777-
// - syncthreads() has 1.8 µs overhead minimum on Apple M2 when nth == 12
17778-
//
17779-
static bool ggml_needs_barrier(const struct ggml_cgraph * cgraph, int mark, int dest) {
17780-
assert(mark >= 0);
17781-
assert(mark <= dest);
17782-
assert(dest < cgraph->n_nodes);
17783-
for (; mark < dest; ++mark) {
17784-
if (ggml_has_src(cgraph->nodes[dest], cgraph->nodes[mark])) {
17785-
return true;
17786-
}
17787-
}
17788-
return false;
17789-
}
17790-
1779117755
static thread_ret_t ggml_graph_compute_thread(void * data) {
1779217756
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
1779317757
const struct ggml_cgraph * cgraph = state->shared->cgraph;
@@ -17799,11 +17763,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1779917763
/*.wsize =*/ cplan->work_size,
1780017764
/*.wdata =*/ cplan->work_data,
1780117765
/*.barrier =*/ &state->shared->barrier,
17802-
/*.limbo =*/ false,
1780317766
};
1780417767

1780517768
set_numa_thread_affinity(state->ith);
1780617769

17770+
#ifdef LLAMAFILE_DEBUG
17771+
if (FLAG_trap) {
17772+
llamafile_trapping_enabled(+1);
17773+
}
17774+
#endif
17775+
1780717776
#ifdef GGML_PERF
1780817777
int64_t start_cycles, start_time_us;
1780917778
if (!state->ith) {
@@ -17819,17 +17788,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1781917788
return 0;
1782017789
}
1782117790

17822-
if (ggml_needs_barrier(cgraph, mark, node_n)) {
17823-
ggml_syncthreads(&params);
17824-
mark = node_n;
17825-
}
17791+
#ifdef LLAMAFILE_DEBUG
17792+
llamafile_debug_op_index = node_n;
17793+
#endif
1782617794

1782717795
struct ggml_tensor *node = cgraph->nodes[node_n];
1782817796
params.nth = state->shared->n_threads;
1782917797
ggml_compute_forward(&params, node);
1783017798

17799+
// this barrier could potentially be eliminated in 15%+ of cases
17800+
// however, it would give rise to ghoulish errors w/ little gain
17801+
ggml_syncthreads(&params);
17802+
1783117803
#if GGML_PERF
17832-
ggml_syncthreads(&state->shared->barrier);
1783317804
if (!state->ith) {
1783417805
int64_t end_cycles = ggml_perf_cycles();
1783517806
int64_t end_time_us = ggml_perf_time_us();
@@ -17844,8 +17815,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1784417815
#endif
1784517816
}
1784617817

17847-
ggml_syncthreads(&params);
17848-
1784917818
return 0;
1785017819
}
1785117820

@@ -18075,6 +18044,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1807518044
};
1807618045
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
1807718046

18047+
#ifdef LLAMAFILE_DEBUG
18048+
llamafile_debug_graph = cgraph;
18049+
#endif
18050+
1807818051
// create thread pool
1807918052
if (n_threads > 1) {
1808018053
for (int j = 1; j < n_threads; ++j) {
@@ -18134,6 +18107,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
1813418107

1813518108
// fprintf(stderr, "%6d barriers %6d ops\n", state_shared.barrier.phase, cgraph->n_nodes);
1813618109

18110+
#ifdef LLAMAFILE_DEBUG
18111+
llamafile_debug_graph = 0;
18112+
#endif
18113+
1813718114
return compute_status;
1813818115
}
1813918116

llama.cpp/ggml.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -678,7 +678,6 @@ extern "C" {
678678
void * wdata;
679679

680680
struct ggml_barrier * barrier;
681-
bool limbo;
682681
};
683682

684683
GGML_API void ggml_syncthreads (struct ggml_compute_params *);

llama.cpp/main/main.cpp

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -184,20 +184,6 @@ int main(int argc, char ** argv) {
184184
return 1;
185185
}
186186

187-
if (!params.mmproj.empty() &&
188-
(!params.image.empty() ||
189-
params.prompt.find("<img src=\"") != std::string::npos)) {
190-
return llava_cli(argc, argv, &params);
191-
}
192-
193-
// TODO: Dump params ?
194-
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
195-
196-
// save choice to use color for later
197-
// (note for later: this is a slightly awkward choice)
198-
console::init(params.simple_io, params.use_color);
199-
atexit([]() { console::cleanup(); });
200-
201187
if (!FLAG_unsecure && !llamafile_has_gpu()) {
202188
// Enable pledge() security on Linux and OpenBSD.
203189
// - We do this *after* opening the log file for writing.
@@ -213,6 +199,7 @@ int main(int argc, char ** argv) {
213199
} else {
214200
promises = "stdio rpath tty";
215201
}
202+
__pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
216203
if (pledge(0, 0)) {
217204
LOG_TEE("warning: this OS doesn't support pledge() security\n");
218205
} else if (pledge(promises, 0)) {
@@ -221,6 +208,20 @@ int main(int argc, char ** argv) {
221208
}
222209
}
223210

211+
if (!params.mmproj.empty() &&
212+
(!params.image.empty() ||
213+
params.prompt.find("<img src=\"") != std::string::npos)) {
214+
return llava_cli(argc, argv, &params);
215+
}
216+
217+
// TODO: Dump params ?
218+
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
219+
220+
// save choice to use color for later
221+
// (note for later: this is a slightly awkward choice)
222+
console::init(!params.interactive || params.simple_io, params.use_color);
223+
atexit([]() { console::cleanup(); });
224+
224225
if (params.logits_all) {
225226
printf("\n************\n");
226227
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);

0 commit comments

Comments
 (0)